library(tidyverse)
library(rvest)
library(urltools)
library(lubridate)

# Client for the O Globo newspaper archive search
# (https://acervo.oglobo.globo.com/busca/), scraped with rvest.
#
# Returns a list with three closures:
#   get_base_url()  returns the search endpoint URL.
#   busca_texto()   runs a search one calendar month at a time and collects
#                   the caption text of every result found.
#   gera_df()       aggregates collected captions into weekly counts.
o_globo <- function() {
  base_url <- "https://acervo.oglobo.globo.com/busca/"
  # Total number of download attempts made for a single URL.
  max_tentativas <- 10L

  # Applies a list of list(key, value) pairs to `url` as query parameters,
  # in the order given.
  set_params <- function(url, params) {
    reduce(
      params,
      function(acc, param) param_set(acc, param[[1]], param[[2]]),
      .init = url
    )
  }

  # Caption text of every result shown on a parsed result page.
  get_referencias <- function(html) {
    html %>% html_nodes("figure > figcaption") %>% html_text()
  }

  # Text of the pagination entries shown on a parsed result page.
  get_paginas <- function(html) {
    html %>% html_nodes("li.page") %>% html_text()
  }

  # Downloads and parses `url`, retrying on failure.
  #
  # The first `max_tentativas - 1` attempts are guarded: a download error is
  # reported, followed by a 10 second pause, and a document without a <body>
  # is treated as a failure too. The first document that has a <body> is
  # returned immediately. The last attempt is unguarded, so a persistent
  # failure surfaces as an error to the caller.
  read_html_proxy <- function(url, print_url = FALSE, tentativa = 1L) {
    while (tentativa < max_tentativas) {
      if (print_url) {
        print(sprintf("Tentativa %i de ler [%s]", tentativa, url))
      }
      res <- tryCatch(
        read_html(url),
        error = function(e) {
          print(sprintf("Erro lendo [%s] (tentativa %i)", url, tentativa))
          Sys.sleep(10)
          NULL
        }
      )
      if (!is.null(res) && length(html_nodes(res, "body")) > 0) {
        return(res)
      }
      tentativa <- tentativa + 1L
    }
    if (print_url) {
      print(sprintf("Tentativa %i de ler [%s]", tentativa, url))
    }
    read_html(url)
  }

  # Searches the archive between two dates, one request series per month.
  #
  # exactword, allwords, anyword, noword: search terms, URL-encoded and sent
  #   in the query parameters of the same names.
  # start, end: date strings parsed with lubridate::dmy(), e.g. "01/08/2012".
  #   Every calendar month touched by the range is queried in full, because
  #   the request only carries decade, year and month.
  # print_url: when TRUE, prints each URL before it is downloaded.
  # only_preview: when TRUE, only the result counts are read and the result
  #   pages are not walked.
  #
  # Returns list(total = <sum of the per-month counts>,
  #              resultados = <captions of all results, or NULL if none>).
  busca_texto <- function(exactword = "", allwords = "", anyword = "",
                          noword = "", start, end, print_url = FALSE,
                          only_preview = FALSE) {
    parametros_fixos <- list(
      list("tipoConteudo", "pagina"),
      list("ordenacaoData", "dataAscendente"),
      list("economia", "on"),
      list("pais", "on"),
      list("opiniao", "on"),
      list("primeirapagina", "on"),
      list("segundapagina", "on"),
      list("exactword", URLencode(exactword, reserved = TRUE)),
      list("allwords", URLencode(allwords, reserved = TRUE)),
      list("anyword", URLencode(anyword, reserved = TRUE)),
      list("noword", URLencode(noword, reserved = TRUE))
    )

    data_inicio <- dmy(start)
    data_fim <- dmy(end)
    if (is.na(data_inicio) || is.na(data_fim)) {
      stop("start and end must be dates in dd/mm/yyyy format", call. = FALSE)
    }

    # One entry per calendar month in the range. A reversed range yields the
    # months in descending order.
    primeiro_mes <- floor_date(data_inicio, "month")
    ultimo_mes <- floor_date(data_fim, "month")
    passo <- if (primeiro_mes <= ultimo_mes) "month" else "-1 month"
    datas <- seq(primeiro_mes, ultimo_mes, by = passo)

    url_busca <- set_params(base_url, parametros_fixos)

    res_por_data <- lapply(seq_along(datas), function(idx) {
      data <- datas[idx]
      parametros_chamada <- list(
        list("decadaSelecionada", as.character(trunc(year(data) / 10) * 10)),
        list("anoSelecionado", as.character(year(data))),
        list("mesSelecionado", as.character(month(data))),
        list("pagina", "1")
      )
      url_mes <- set_params(url_busca, parametros_chamada)

      html <- read_html_proxy(url_mes, print_url)

      # Summary line of the result page; the count is read from the digits
      # at its start. An absent summary line is taken to mean no results.
      tex <- str_wrap(
        html %>%
          html_nodes("div.pre-resultado > p.resultados-small") %>%
          html_text()
      )
      tem_resultados <- length(tex) > 0

      resumo <- list(
        total = if (tem_resultados) {
          as.integer(str_extract(tex[1], "^(\\d+)"))
        } else {
          0L
        }
      )

      if (!only_preview && tem_resultados) {
        paginas <- get_paginas(html)
        res <- get_referencias(html)
        # Keep following pages for as long as the next page number is listed
        # in the pagination of the page just read.
        i <- 2L
        while (as.character(i) %in% paginas) {
          url_pagina <- param_set(url_mes, "pagina", as.character(i))
          print(sprintf("Obtendo página %i. Url: %s", i, url_pagina))
          html <- read_html_proxy(url_pagina, print_url)
          res <- c(res, get_referencias(html))
          paginas <- get_paginas(html)
          Sys.sleep(2)
          i <- i + 1L
        }
        resumo$resultados <- res
      }
      resumo
    })

    list(
      total = sum(vapply(res_por_data, function(res) res$total, integer(1))),
      resultados = unlist(
        lapply(res_por_data, function(res) res$resultados),
        use.names = FALSE
      )
    )
  }

  # Counts results per week.
  #
  # lista: character vector of captions, as found in busca_texto()$resultados.
  # qtd_col_name: name given to the count column of the result.
  #
  # Each caption is split by the regex into the text before the first ", "
  # (parsed as a day/month/year date) and the digits at the end. Each date is
  # moved back to the first day of its week in lubridate's wday() numbering
  # (Sunday unless the lubridate week-start option is changed).
  #
  # Returns a tibble with columns "Semana" and `qtd_col_name`.
  gera_df <- function(lista, qtd_col_name) {
    # tidyr:: is spelled out so a later-attached magrittr::extract cannot
    # mask it.
    df <- tibble(paginas = lista) %>%
      tidyr::extract(
        col = paginas,
        into = c("data", "numero"),
        regex = "(.*?)\\, .*?(\\d*)$"
      ) %>%
      mutate(data_edicao = dmy(data)) %>%
      mutate(semana = data_edicao - (wday(data_edicao) - 1)) %>%
      group_by(semana) %>%
      summarize(qtd = n())
    colnames(df) <- c("Semana", qtd_col_name)
    df
  }

  list(
    get_base_url = function() base_url,
    busca_texto = busca_texto,
    gera_df = gera_df
  )
}

# Client for the Folha de S.Paulo archive search
# (http://acervo.folha.com.br/busca.do), scraped with rvest.
#
# Returns a list with three closures:
#   get_base_url()  returns the search endpoint URL.
#   busca_texto()   runs a search and walks its result pages.
#   gera_df()       aggregates collected references into weekly counts.
folha <- function() {
  base_url <- "http://acervo.folha.com.br/busca.do"

  # Trimmed and wrapped text of every edition reference on a result page.
  get_referencias <- function(html) {
    html %>%
      html_nodes("a.edition > small") %>%
      html_text() %>%
      str_trim() %>%
      str_wrap()
  }

  # Text of the page-number links shown on a result page.
  get_paginas <- function(html) {
    html %>%
      html_nodes("a.page-number") %>%
      html_text()
  }

  # Runs one search and, unless `just_check` is set, follows the pagination.
  #
  # criterio: search expression, e.g. "exato:lava jato"; sent URL-encoded as
  #   the `keyword` parameter.
  # start, end: date strings such as "01/01/2014"; sent URL-encoded as the
  #   `startDate` and `endDate` parameters.
  # i: number of the first page requested.
  # just_check: when TRUE, only the summary of the first request is returned.
  # show_url: when TRUE, prints the URL of the first request.
  #
  # Returns a list with `descricao` (text of the results toolbar), `total`
  # (the leading digits of that text as an integer) and, unless `just_check`
  # is TRUE, `resultados` (the references collected from all pages read).
  busca_texto <- function(criterio, start, end, i = 1, just_check = FALSE, show_url = FALSE) {
    # Query parameters, applied to the URL in this order.
    consulta <- list(
      sort = "asc",
      decadeStatus = "",
      jornais = "1",
      periododesc = "",
      por = "Por+Per%edodo",
      days = "",
      month = "",
      year = "",
      keyword = URLencode(criterio, reserved = TRUE),
      startDate = URLencode(start, reserved = TRUE),
      endDate = URLencode(end, reserved = TRUE),
      page = as.character(i)
    )

    url <- base_url
    for (chave in names(consulta)) {
      url <- param_set(url, chave, consulta[[chave]])
    }
    if (show_url) {
      print(url)
    }

    pagina_html <- read_html(url)
    descricao <- pagina_html %>%
      html_nodes("div.results-tool-bar > span") %>%
      html_text()
    resumo <- list(
      descricao = descricao,
      total = as.integer(str_extract(descricao, "^(\\d+)"))
    )

    # Summary only: the list is returned without a `resultados` element.
    if (just_check) {
      return(resumo)
    }

    encontrados <- get_referencias(pagina_html)
    paginas <- get_paginas(pagina_html)
    # While the current page number appears among the page links, request
    # the following page and append its references.
    repeat {
      if (!(as.character(i) %in% paginas)) {
        break
      }
      i <- i + 1
      print(sprintf("Obtendo página %i", i))
      pagina_html <- read_html(param_set(url, "page", as.character(i)))
      encontrados <- c(encontrados, get_referencias(pagina_html))
      paginas <- get_paginas(pagina_html)
      Sys.sleep(2)
    }

    resumo$resultados <- encontrados
    resumo
  }

  # Counts references per week.
  #
  # lista: character vector of references, as found in
  #   busca_texto()$resultados.
  # qtd_col_name: name given to the count column of the result.
  #
  # The regex pulls a dd/mm/yyyy date and a six-character edition number out
  # of each reference. Each date is moved back to the first day of its week
  # in lubridate's wday() numbering before counting.
  #
  # Returns a tibble with columns "Semana" and `qtd_col_name`.
  gera_df <- function(lista, qtd_col_name) {
    por_semana <- tibble(paginas = lista) %>%
      extract(
        col = paginas,
        into = c("data", "numero"),
        regex = "(\\d\\d/\\d\\d/\\d\\d\\d\\d) .*? (\\d\\d.\\d\\d\\d).*"
      ) %>%
      mutate(
        data_edicao = dmy(data),
        semana = data_edicao - (wday(data_edicao) - 1)
      ) %>%
      group_by(semana) %>%
      summarize(qtd = n())

    colnames(por_semana) <- c("Semana", qtd_col_name)
    por_semana
  }

  list(
    get_base_url = function() base_url,
    busca_texto = busca_texto,
    gera_df = gera_df
  )
}

